package net.oschina.htmlsucker;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.IOException;
import java.net.URL;

/**
 * <p>Entry point of HtmlSucker.</p>
 * <p>Usage:</p>
 * <code>
 * HtmlSucker.select(HtmlSucker.MAX_TEXT_EXTRACTOR).parse(html);
 * </code>
 * @author Winter Lau (javayou@gmail.com)
 */
public class HtmlSucker {

    /** Selects the extractor backed by {@code MaxTextContentExtractor} (maximum text length). */
    public static final byte MAX_TEXT_EXTRACTOR     = 0x01;
    /** Selects the extractor backed by {@code TextDensityExtractor} (text density). */
    public static final byte TEXT_DENSITY_EXTRACTOR = 0x02;

    /**
     * Demo entry point: fetches a sample URL and prints the extracted article.
     *
     * @param args ignored
     * @throws IOException if the page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        String url = "https://mp.weixin.qq.com/s/fFcFlgdtK9ebW3TtCcIi5Q";
        // Offline sample document; not used by default. Swap the call below for parse(html)
        // to try the extractor without network access.
        String html = "<html><head><title>Test</title></head><body><div class='article'><h1>开源中国社区</h1><div class='content'><img src='http://www.oschina.net/logo.gif'/><ul><li>Hello</li></ul><p>Many languages support default arguments for methods and constructors out of the box, i.e. Scala:</p><p>The sum method can be invoked as follows:</p><p>This is very handy, but Java doesn't support it. There are a few different ways to accomplish something similar, however all of them have some drawback.</p><div class='copyright'>oschina</div></div></div></body></html>";
        System.out.println(HtmlSucker.select(TEXT_DENSITY_EXTRACTOR).parse(url, 20000));
    }

    /**
     * Creates an {@code HtmlSucker} that uses the requested content-extraction algorithm.
     *
     * @param extrator one of {@link #MAX_TEXT_EXTRACTOR} or {@link #TEXT_DENSITY_EXTRACTOR}
     * @return a new instance bound to the chosen extractor
     * @throws IllegalArgumentException if {@code extrator} is not one of the known constants
     */
    public static HtmlSucker select(byte extrator) {
        switch (extrator) {
        case MAX_TEXT_EXTRACTOR:
            return new HtmlSucker(new MaxTextContentExtractor());
        case TEXT_DENSITY_EXTRACTOR:
            return new HtmlSucker(new TextDensityExtractor());
        default:
            // Report the offending selector value (the parameter), not an extractor instance.
            throw new IllegalArgumentException("Illegal Extractor defined: value = " + extrator);
        }
    }

    /** The content-extraction strategy chosen in {@link #select(byte)}. */
    private final ContentExtractor extractor;

    private HtmlSucker(ContentExtractor extractor) {
        this.extractor = extractor;
    }

    /**
     * Fetches the page at the given URL and extracts the article information from it.
     *
     * @param url        address of the page to fetch
     * @param timeMillis timeout, in milliseconds, handed to jsoup for the fetch
     * @return the extracted article
     * @throws IOException if the URL is malformed or the page cannot be fetched
     */
    public Article parse(String url, int timeMillis) throws IOException {
        return parse(Jsoup.parse(new URL(url), timeMillis));
    }

    /**
     * Extracts the article information from HTML markup.
     *
     * @param html the HTML source to parse
     * @return the extracted article
     */
    public Article parse(String html) {
        return parse(Jsoup.parse(html));
    }

    /**
     * Builds an {@link Article} from a parsed document: metadata first, then the main
     * content, and finally a fallback cover image taken from the content itself.
     *
     * @param doc the parsed document
     * @return the populated article
     */
    private Article parse(Document doc) {
        Article art = new Article();
        art.setTitle(MetadataExtractor.title(doc));
        art.setDescription(MetadataExtractor.description(doc));
        art.setKeywords(MetadataExtractor.keywords(doc));
        art.setAuthor(MetadataExtractor.author(doc));
        art.setDate(MetadataExtractor.date(doc));
        art.setImage(MetadataExtractor.image(doc));

        // Extract the main content.
        art.setContent(extractor.content(doc.body()));

        // No image in the metadata: fall back to the first <img> of the extracted content.
        String content = art.getContent();
        boolean imageMissing = art.getImage() == null || art.getImage().isEmpty();
        if (imageMissing && content != null && !content.isEmpty()) {
            // Re-parse with the source document's base URI so that "abs:src" can resolve
            // relative image paths; without a base URI they resolve to an empty string.
            Document body = Jsoup.parse(content, doc.baseUri());
            Element img = body.select("img").first();
            if (img != null) {
                String src = img.attr("abs:src");
                if (src.isEmpty()) {
                    // Lazy-loading pages keep the real address in data-src.
                    src = img.attr("data-src");
                }
                art.setImage(src);
            }
        }

        return art;
    }

}
